import sys
sys.path.append('/usr/local/lib/python2.7/dist-packages')
sys.path.append('/Library/Frameworks/Python.framework/Versions/3.9/lib/python3.9/site-packages')
import os.path
import numpy as np
import session_info
import pandas as pd
from pathlib import Path
# EDA
import hvplot
import hvplot.pandas
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
# Display and Settings
from IPython.display import display
# Notebook convenience: show every column and row when rendering frames
pd.options.display.max_columns = None
pd.options.display.max_rows = None
def download_data():
    """
    Placeholder for retrieving the raw data file from cloud storage.

    Directory creation is intended to move here from get_datafile.
    :return: Status message of successful download
    """
    print('Implement data retrieval function')
# Loading the data file
def get_datafile(path, file_name):
    """
    Load a CSV data file, downloading it first if it is missing.

    :param path: directory (relative to the working directory), e.g. 'data/raw/'
    :param file_name: file name without extension
    :return: pandas DataFrame with the file contents; exits the process when
             the file cannot be found even after the download attempt
    """
    # Data parameters
    ext = '.csv'
    full_path = os.path.join(path, file_name + ext)
    # Bug fix: the directory used to be checked/created under sys.path[0]
    # while the file was tested and read relative to the CWD; both now use
    # the same (CWD-relative) location.
    Path(path).mkdir(parents=True, exist_ok=True)
    # Add logging base level WARNING (as cloud operations can be costly)
    try:
        if not os.path.isfile(full_path):
            download_data()
        df = pd.read_csv(full_path)
        print('Data loaded successfully')
        # Add logging level INFO
        return df
    except FileNotFoundError as e:
        print(f'Data File Not Found Error: {e}')
        # Add logging base level ERROR
        sys.exit(1)
# Raw dataset location (relative to the working directory)
raw_file_name = 'removals'
raw_path = 'data/raw/'
# Load the raw component-removals dataset
df = get_datafile(raw_path, raw_file_name)
Data loaded successfully
def get_overview(df):
    """Print a structural summary (dtypes, non-null counts) and the shape of df."""
    summary = df.info(verbose=True, show_counts=True)
    print(summary)  # df.info prints itself and returns None
    print(' The Shape of the Dataset:', df.shape)
get_overview(df)
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10323 entries, 0 to 10322 Data columns (total 16 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 OPERATOR_CD 10323 non-null object 1 AIRCRAFT_CD 10323 non-null int64 2 FLEET_CD 10323 non-null int64 3 CHAPTER_CD 10323 non-null int64 4 PART_GROUP_CD 10323 non-null object 5 PART_NUMBER 10323 non-null object 6 SERIAL_NUMBER 10323 non-null object 7 INSTALL_DT 10323 non-null object 8 REMOVAL_DT 10323 non-null object 9 REMOVED_BOOL 10323 non-null int64 10 AIRCRAFT_AGE 10323 non-null int64 11 REPETITIVE_FAULT_QT 10323 non-null int64 12 MEL_QTY 7534 non-null float64 13 DELAY_QTY 6187 non-null float64 14 TIME_SINCE_INSTALL_CYCLES 10323 non-null int64 15 TIME_SINCE_NEW_CYCLES 10323 non-null int64 dtypes: float64(2), int64(8), object(6) memory usage: 1.3+ MB None The Shape of the Dataset: (10323, 16)
ASSUMPTION/RESEARCH FACT:
Predictive modeling is affected by missing data; theoretically, the model can still perform well if the share of missing data is below 25–30%. Otherwise, it might give wrong predictions (although the model's performance will still be subject to the data it is trained on). Therefore, only the features with more than 70% of their entries present are considered.
Cleaning Notes: The data is a mix of textual and numerical data and is mostly cleaned
After performing initial checks, the current data only has empty values; it is consistent, has no impossible values, and shows nearly no outliers (based on EDA and min/max analysis done in Microsoft Excel).
def data_cleaning(df):
    """
    Check the frame for null values and summarise them per column.

    :param df: input DataFrame
    :return: (summary, skip) where summary has 'col_name', 'value' (null
             count) and 'perc' (% of rows) columns; summary is empty and
             skip is True when the frame has no nulls at all
    """
    if not df.isnull().values.any():
        data = pd.DataFrame()
        print('Data values check returned no Null values')
        return data, True
    # Per-column null count reshaped into a tidy summary frame
    data = df.isnull().sum().rename_axis('col_name').reset_index(name='value')
    # Percentage of rows that are null, rounded to one decimal
    data['perc'] = (data['value'] / (df.shape[0] / 100)).round(1)
    print('% of missing values')
    print(data)
    return data, False
null_value_data, skip = data_cleaning(df)
% of missing values
col_name value perc
0 OPERATOR_CD 0 0.0
1 AIRCRAFT_CD 0 0.0
2 FLEET_CD 0 0.0
3 CHAPTER_CD 0 0.0
4 PART_GROUP_CD 0 0.0
5 PART_NUMBER 0 0.0
6 SERIAL_NUMBER 0 0.0
7 INSTALL_DT 0 0.0
8 REMOVAL_DT 0 0.0
9 REMOVED_BOOL 0 0.0
10 AIRCRAFT_AGE 0 0.0
11 REPETITIVE_FAULT_QT 0 0.0
12 MEL_QTY 2789 27.0
13 DELAY_QTY 4136 40.1
14 TIME_SINCE_INSTALL_CYCLES 0 0.0
15 TIME_SINCE_NEW_CYCLES 0 0.0
# Analysing feature uniqueness and Target Values
def target_analysis(col):
    """Print and plot the class distribution of the target column.

    NOTE(review): reads the module-level `df` rather than taking it as a
    parameter — consider passing the frame in explicitly.
    """
    print('__________________________________________________________________________________________')
    print('TARGET DISTRIBUTION')
    counts = df[col].value_counts(dropna=False)
    print(counts)
    counts.plot(kind='bar', color=['c', 'darkorange'], figsize=(10, 5),
                title='Target Distribution')
    plt.show()
def feature_analysis(df, col):
    """
    Report the number of distinct values per feature, then analyse the target.

    Columns with 10 or fewer distinct values also get a full value count
    (including NaN).
    """
    print('DATA FEATURES UNIQUENESS')
    print('__________________________________________________________________________________________')
    for column in df.columns.values.tolist():
        print(column)
        n_unique = len(df[column].unique())
        print(f'Total of unique values {n_unique}')
        if n_unique <= 10:
            print(df[column].value_counts(dropna=False))
    # Target Column
    target_analysis(col)
# Binary target: whether the component was removed
target_col = 'REMOVED_BOOL'
feature_analysis(df, target_col)
DATA FEATURES UNIQUENESS __________________________________________________________________________________________ OPERATOR_CD Total of unique values 2 JUE 5376 VST 4947 Name: OPERATOR_CD, dtype: int64 AIRCRAFT_CD Total of unique values 10 763 1576 784 1472 955 1361 478 1317 835 1177 479 1092 527 965 915 830 925 499 708 34 Name: AIRCRAFT_CD, dtype: int64 FLEET_CD Total of unique values 2 737 8029 787 2294 Name: FLEET_CD, dtype: int64 CHAPTER_CD Total of unique values 39 PART_GROUP_CD Total of unique values 732 PART_NUMBER Total of unique values 934 SERIAL_NUMBER Total of unique values 6734 INSTALL_DT Total of unique values 7426 REMOVAL_DT Total of unique values 1572 REMOVED_BOOL Total of unique values 2 1 8182 0 2141 Name: REMOVED_BOOL, dtype: int64 AIRCRAFT_AGE Total of unique values 19 REPETITIVE_FAULT_QT Total of unique values 51 MEL_QTY Total of unique values 721 DELAY_QTY Total of unique values 121 TIME_SINCE_INSTALL_CYCLES Total of unique values 1912 TIME_SINCE_NEW_CYCLES Total of unique values 3407 __________________________________________________________________________________________ TARGET DISTRIBUTION 1 8182 0 2141 Name: REMOVED_BOOL, dtype: int64
Reasons for the irrelevance of the following columns:
def feature_selection(df, data, remove_col, skip=True, selection_threshold=30):
    """
    Drop features with too many nulls plus explicitly excluded columns.

    :param df: input DataFrame
    :param data: null-value summary with 'col_name' and 'perc' columns
    :param remove_col: column names to drop regardless of null percentage
    :param skip: True when there is no null summary (keep all columns)
    :param selection_threshold: maximum allowed % of null entries
    :return: DataFrame restricted to the selected columns
    """
    if not skip:
        # Separating relevant and not compliant features.
        # Bug fix: a column whose perc equals the threshold exactly used to
        # match neither filter and silently vanish; it is now kept.
        removed_col = data.loc[data['perc'] > selection_threshold, 'col_name']
        relevant_col = data.loc[data['perc'] <= selection_threshold, 'col_name']
        print(f' {removed_col.shape[0]} feature(s) removed due to null entries')
        print(removed_col)
        print('__________________________________________________________________________________________')
        cols_of_interest_list = relevant_col.tolist()
    else:
        # Selecting all columns if no null value columns to remove
        cols_of_interest_list = df.columns.values
    # Set subtraction loses ordering; callers select by name, not position
    cols_of_interest_list = list(set(cols_of_interest_list) - set(remove_col))
    print(f' {len(cols_of_interest_list)} features of interest')
    df = df[cols_of_interest_list]
    print(f'Selected Data Shape: {df.shape}')
    return df
# Define the selection criteria for null value columns (currently <30%)
selection_threshold = 30
# Columns to drop based on feature selection (analysis)
# Bug fix: 'DELAY_OTY' was a typo for 'DELAY_QTY'. The result is unchanged
# here because DELAY_QTY is already removed by the null-percentage threshold,
# but the typo would matter whenever skip is True.
remove_col = ['PART_GROUP_CD', 'PART_NUMBER', 'DELAY_QTY', 'CHAPTER_CD']
df = feature_selection(df, null_value_data, remove_col, skip, selection_threshold)
1 feature(s) removed due to null entries 13 DELAY_QTY Name: col_name, dtype: object __________________________________________________________________________________________ 12 features of interest Selected Data Shape: (10323, 12)
Type of analysis done:
Inference from the EDA is provided at the end of subpoint 6 (after the plots).
def num_col_analysis(df):
    """Return transposed descriptive statistics for the numeric columns."""
    stats = df.describe()
    return stats.transpose()
# Numerical column analysis
# (in a notebook, the returned frame is rendered as the cell output)
num_col_analysis(df)
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| TIME_SINCE_NEW_CYCLES | 10323.0 | 2024.123607 | 5018.069204 | -155.0 | 0.0 | 584.0 | 1896.0 | 227612.0 |
| REMOVED_BOOL | 10323.0 | 0.792599 | 0.405465 | 0.0 | 1.0 | 1.0 | 1.0 | 1.0 |
| FLEET_CD | 10323.0 | 748.111111 | 20.787992 | 737.0 | 737.0 | 737.0 | 737.0 | 787.0 |
| AIRCRAFT_CD | 10323.0 | 730.924344 | 178.117204 | 478.0 | 527.0 | 784.0 | 915.0 | 955.0 |
| MEL_QTY | 7534.0 | 165.974648 | 211.939339 | 1.0 | 19.0 | 69.0 | 235.0 | 1050.0 |
| AIRCRAFT_AGE | 10323.0 | 18.201782 | 3.647077 | 10.0 | 16.0 | 18.0 | 20.0 | 28.0 |
| TIME_SINCE_INSTALL_CYCLES | 10323.0 | 574.549356 | 1132.307202 | -163.0 | 0.0 | 133.0 | 729.0 | 31713.0 |
| REPETITIVE_FAULT_QT | 10323.0 | 1.993703 | 6.460128 | 0.0 | 0.0 | 0.0 | 2.0 | 79.0 |
def cat_col_analysis(df):
    """Return transposed descriptive statistics for the object (text) columns."""
    return df.describe(include=['object']).T
# Textual column analysis
# (in a notebook, the returned frame is rendered as the cell output)
cat_col_analysis(df)
| count | unique | top | freq | |
|---|---|---|---|---|
| OPERATOR_CD | 10323 | 2 | JUE | 5376 |
| INSTALL_DT | 10323 | 7426 | 9/17/2016 | 27 |
| SERIAL_NUMBER | 10323 | 6734 | SKI33SD | 21 |
| REMOVAL_DT | 10323 | 1572 | 9/30/2020 | 2141 |
def missing_value_eda(df):
    """Visualise missingness: a per-column bar chart, then a null heatmap."""
    # Bar chart of non-null counts per column
    msno.bar(df, color='c', figsize=(12, 7))
    plt.show()
    # Heatmap highlighting where the null entries occur
    plt.figure(figsize=(12, 7))
    sns.heatmap(df.isnull(), cmap='Wistia', cbar=False, yticklabels=False)
    plt.title('Missing value heatmap', fontsize=15)
    plt.show()
missing_value_eda(df)
def correlation_analysis(df, annot=True):
    """Draw a heatmap of pairwise correlations between the numeric columns."""
    plt.figure(figsize=(15, 8))
    corr_matrix = df.corr()
    sns.heatmap(corr_matrix, annot=annot, cmap='viridis')
    plt.title('Correlation matrix of Numerical Data', fontsize=15)
    plt.show()
# Correlation heatmap of numeric features
correlation_analysis(df)
# Histogram of data features
print('Distribution of Aircraft Age with Target')
# Interactive hvplot histogram, split by the removal target
df.hvplot.hist(y='AIRCRAFT_AGE', by='REMOVED_BOOL',
               subplots=False, width=800, height=400, bins=50, alpha=0.4)
Distribution of Aircraft Age with Target
def distribution_plot_eda(df, col, kde=True, rug=False):
    """Draw a seaborn distribution plot for each column named in `col`."""
    for feature in col:
        sns.displot(df[feature], height=7, kde=kde, rug=rug)
        plt.title(str(feature) + ' Distribution', fontsize=10)
        plt.show()
# Cycle-count features whose distributions we want to inspect
col_list = ['TIME_SINCE_INSTALL_CYCLES', 'TIME_SINCE_NEW_CYCLES']
distribution_plot_eda(df, col_list)
def histogram_plot_eda(df, item, target, subplots=True, width=800, height=400):
    """Return an hvplot box plot of `item` split by `target`.

    NOTE(review): despite the name, this draws a box plot, not a histogram.
    """
    plot = df.hvplot.box(y=item, subplots=subplots, by=target,
                         width=width, height=height)
    return plot
# Box plots of install/new cycle counts against the removal target
target = 'REMOVED_BOOL'
histogram_plot_eda(df, col_list[0], target)
histogram_plot_eda(df, col_list[1], target)
def sub_plots_eda(df, col):
    """Count plots for the first four columns in `col`, laid out on a grid.

    The fourth plot gets rotated x-axis labels (it has many categories).
    """
    palette = ['#46C1E5', '#FF7070']
    plt.figure(figsize=(15, 20))
    # First three plots share identical settings
    for position in range(3):
        plt.subplot(4, 2, position + 1)
        sns.countplot(x=df[col[position]], palette=palette)
    plt.subplot(4, 2, 4)
    ax = sns.countplot(x=df[col[3]], palette=palette)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
    plt.show()
# Low-cardinality features for the grid of count plots
col_list = ['FLEET_CD', 'OPERATOR_CD', 'AIRCRAFT_AGE', 'REPETITIVE_FAULT_QT']
sub_plots_eda(df, col_list)
# Drop columns judged not useful after the EDA above
# (presumably redundant/identifier-like — see analysis notes; TODO confirm)
drop_col = ['TIME_SINCE_NEW_CYCLES', 'SERIAL_NUMBER']
for item in drop_col:
    df.drop([item], axis=1, inplace=True)
The dataset contains both numerical (float/int) and object-type data (text, dates). Feature engineering converts the data to a uniform numerical format so the model can learn and predict better. Mixed-type data works well with RNNs and other deep neural networks.
Type Conversions
Conversion from text to numerical values using dummy creation, similar to one-hot encoding
Handling missing/empty values in the data
def get_imputing_lists(df):
    """
    Split the columns that contain nulls into mean- vs mode-imputation lists.

    float64 columns are imputed with their mean; every other dtype with
    its mode.

    :param df: input DataFrame
    :return: (mean_features, mode_features) lists of column names
    """
    mean_features = []
    mode_features = []
    # Iterate the columns directly; the previous .astype(np.int32) on the
    # null count was redundant (the sum already compares fine against 0).
    for item in df.columns:
        if df[item].isnull().sum() > 0:
            if df[item].dtype == np.float64:
                mean_features.append(item)
            else:
                mode_features.append(item)
    print(f'Numerical features list {mean_features}')
    print(f'Categorical features list {mode_features}')
    return mean_features, mode_features
mean_features, mode_features = get_imputing_lists(df)
Numerical features list ['MEL_QTY'] Categorical features list []
def num_imputations(df, mean_features):
    """Mean-impute the listed numeric columns; returns the (possibly new) frame.

    No-op when `mean_features` is empty. The imputed columns are re-appended
    at the end of the frame.
    """
    if mean_features:
        print('Imputing Numerical values')
        imputer = SimpleImputer()  # mean imputation by default
        filled = imputer.fit_transform(df[mean_features])
        mean_df = pd.DataFrame(data=filled, columns=mean_features)
        df.drop(mean_features, axis=1, inplace=True)
        df = pd.concat([df, mean_df], axis=1)
    return df
def cat_imputations(df, mode_features):
    """Mode-impute the listed categorical columns in place; returns the frame.

    No-op when `mode_features` is empty.
    """
    if mode_features:
        print('Imputing Categorical values')
        for column in mode_features:
            most_common = df[column].mode()[0]
            df[column].fillna(most_common, inplace=True)
    return df
# Apply the imputers determined by get_imputing_lists
df = num_imputations(df, mean_features)
# Bug fix: mode_features was previously passed to num_imputations (a no-op
# only because mode_features happens to be empty); categorical columns must
# go through the mode-based cat_imputations.
df = cat_imputations(df, mode_features)
print('Rechecking null values')
df.isnull().values.any()
Imputing Numerical values Rechecking null values
False
def date_time_conversion(df):
    """
    Convert install/removal date strings into day-based numeric features.

    Adds COMPONENT_LIFE_DAYS (removal minus install, in days), converts
    AIRCRAFT_AGE from years to days, and drops the original date columns.
    Mutates the input frame (columns are dropped in place) and returns the
    merged result.

    :param df: frame containing INSTALL_DT ('m/d/Y[ time]'), REMOVAL_DT
               ('m/d/Y'), REMOVED_BOOL and AIRCRAFT_AGE (years)
    :return: frame with engineered date features appended
    """
    # Relevant date time columns.
    # Bug fix: take an explicit copy — assigning into the view returned by
    # df[cols] caused the SettingWithCopyWarning spam seen in the output.
    cols = ['INSTALL_DT', 'REMOVAL_DT', 'REMOVED_BOOL', 'AIRCRAFT_AGE']
    d_df = df[cols].copy()
    # Splitting INSTALL_DT into temporary date and time columns
    # (n and expand are keyword-only in recent pandas versions)
    d_df[['A', 'B']] = d_df['INSTALL_DT'].str.split(' ', n=1, expand=True)
    # Setting consistent format
    d_df['INSTALL_DT'] = pd.to_datetime(d_df['A'], format='%m/%d/%Y')
    d_df['REMOVAL_DT'] = pd.to_datetime(d_df['REMOVAL_DT'], format='%m/%d/%Y')
    # Calculating component install life in days
    d_df['COMPONENT_LIFE_DAYS'] = (d_df['REMOVAL_DT'] - d_df['INSTALL_DT']).dt.days
    cols = ['REMOVED_BOOL', 'COMPONENT_LIFE_DAYS', 'AIRCRAFT_AGE']
    d_df = d_df[cols].copy()
    # Evaluating mean of component life against target variable
    means = d_df.groupby('REMOVED_BOOL').mean().reset_index()
    print(f'Mean of component life with Target:\n {means}')
    # Data shows high unexpected component life for all Installed (0) components
    # Streamlining age of aircraft and component life to values in days
    d_df['AIRCRAFT_AGE'] = d_df['AIRCRAFT_AGE'] * 365
    print(d_df.head())
    cols = ['REMOVED_BOOL', 'AIRCRAFT_AGE', 'INSTALL_DT', 'REMOVAL_DT']
    # Merging with original dataframe
    df.drop(cols, axis=1, inplace=True)
    df = pd.concat([df, d_df], axis=1)
    return df
df = date_time_conversion(df)
Mean of component life with Target:
REMOVED_BOOL COMPONENT_LIFE_DAYS AIRCRAFT_AGE
0 0 1184.885567 18.382532
1 1 156.482645 18.154485
REMOVED_BOOL COMPONENT_LIFE_DAYS AIRCRAFT_AGE
0 1 0 8030
1 1 848 5840
2 1 2 6570
3 1 15 6935
4 1 1 7300
/usr/local/Cellar/jupyterlab/3.1.10/libexec/lib/python3.9/site-packages/pandas/core/frame.py:3641: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy self[k1] = value[k2] /var/folders/3f/3ft5mtys7rv055x35yb8dqph0000gn/T/ipykernel_4941/1579903364.py:8: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy d_df['INSTALL_DT'] = pd.to_datetime(d_df['A'], format='%m/%d/%Y') /var/folders/3f/3ft5mtys7rv055x35yb8dqph0000gn/T/ipykernel_4941/1579903364.py:9: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy d_df['REMOVAL_DT'] = pd.to_datetime(d_df['REMOVAL_DT'], format='%m/%d/%Y') /var/folders/3f/3ft5mtys7rv055x35yb8dqph0000gn/T/ipykernel_4941/1579903364.py:12: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy d_df['COMPONENT_LIFE_DAYS'] = (d_df['REMOVAL_DT'] - d_df['INSTALL_DT']).dt.days
def generate_dummy_col(df, col):
    """One-hot encode the columns in `col` via pd.get_dummies.

    Returns the input frame unchanged when `col` is empty.
    """
    if not col:
        return df
    df = pd.get_dummies(df, columns=col)
    print(df.head())
    return df
#dummy_col = ['OPERATOR_CD','FLEET_CD']
# Not creating dummies as it resulted in high correlation in CoxPHFitter model training
dummy_col = []
df = generate_dummy_col(df, dummy_col)
# Final structural check of the engineered dataset
print(df.info(verbose=True, show_counts=True))
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10323 entries, 0 to 10322 Data columns (total 9 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 FLEET_CD 10323 non-null int64 1 AIRCRAFT_CD 10323 non-null int64 2 OPERATOR_CD 10323 non-null object 3 TIME_SINCE_INSTALL_CYCLES 10323 non-null int64 4 REPETITIVE_FAULT_QT 10323 non-null int64 5 MEL_QTY 10323 non-null float64 6 REMOVED_BOOL 10323 non-null int64 7 COMPONENT_LIFE_DAYS 10323 non-null int64 8 AIRCRAFT_AGE 10323 non-null int64 dtypes: float64(1), int64(7), object(1) memory usage: 726.0+ KB None
def save_data(df, path, file_name):
    """
    Save a DataFrame as CSV under `path`, creating the directory if needed.

    :param df: DataFrame to persist
    :param path: target directory (relative to the working directory)
    :param file_name: file name without extension
    """
    # Bug fix: the directory used to be checked/created under sys.path[0]
    # while the CSV was written relative to the CWD; both now use the same
    # (CWD-relative) location.
    Path(path).mkdir(parents=True, exist_ok=True)
    # Add logging base level WARNING (as cloud operations can be costly)
    ext = '.csv'
    file_name = file_name + ext
    df.to_csv(os.path.join(path, file_name), index=False)
    print(f'{file_name} file created')
# Persist the processed dataset for the modelling stage
processed_file_name = 'filtered_data'
processed_path = 'data/processed/'
save_data(df, processed_path, processed_file_name)
filtered_data.csv file created
# Reload the saved file to verify the CSV round-trips cleanly
df = get_datafile(processed_path, processed_file_name)
print(df.head())
Data loaded successfully FLEET_CD AIRCRAFT_CD OPERATOR_CD TIME_SINCE_INSTALL_CYCLES \ 0 737 479 VST 0 1 737 478 VST 2325 2 737 478 VST 8 3 737 479 VST 24 4 737 479 VST 6 REPETITIVE_FAULT_QT MEL_QTY REMOVED_BOOL COMPONENT_LIFE_DAYS \ 0 0 165.974648 1 0 1 9 366.000000 1 848 2 1 1.000000 1 2 3 2 7.000000 1 15 4 1 165.974648 1 1 AIRCRAFT_AGE 0 8030 1 5840 2 6570 3 6935 4 7300
session_info.show()
----- holoviews 1.14.6 hvplot 0.7.3 matplotlib 3.4.3 missingno 0.5.0 numpy 1.21.3 pandas 1.3.4 seaborn 0.11.2 session_info 1.0.0 sklearn 1.0.1 -----
PIL 8.4.0 anyio NA appnope 0.1.2 attr 21.2.0 babel 2.9.1 backcall 0.2.0 beta_ufunc NA binom_ufunc NA bleach 4.1.0 bokeh 2.4.2 certifi 2021.05.30 cffi 1.14.6 charset_normalizer 2.0.4 colorcet 3.0.0 cycler 0.10.0 cython_runtime NA dateutil 2.8.2 debugpy 1.4.1 decorator 5.0.9 defusedxml 0.7.1 entrypoints 0.3 idna 3.2 ipykernel 6.3.1 ipython_genutils 0.2.0 jedi 0.18.0 jinja2 3.0.1 joblib 1.1.0 json5 NA jsonschema 3.2.0 jupyter_server 1.10.2 jupyterlab_pygments 0.1.2 jupyterlab_server 2.7.2 kiwisolver 1.3.2 markupsafe 2.0.1 matplotlib_inline NA mistune 0.8.4 mpl_toolkits NA nbclassic NA nbclient 0.5.4 nbconvert 6.1.0 nbformat 5.1.3 nbinom_ufunc NA packaging 21.0 pandocfilters NA panel 0.12.6 param 1.12.0 parso 0.8.2 pexpect 4.8.0 pickleshare 0.7.5 pkg_resources NA prometheus_client NA prompt_toolkit 3.0.20 ptyprocess 0.7.0 pvectorc NA pydev_ipython NA pydevconsole NA pydevd 2.5.0 pydevd_concurrency_analyser NA pydevd_file_utils NA pydevd_plugins NA pydevd_tracing NA pyexpat NA pygments 2.10.0 pyparsing 2.4.7 pyrsistent NA pytz 2021.1 pyviz_comms 2.1.0 requests 2.26.0 scipy 1.7.1 send2trash NA sitecustomize NA six 1.16.0 sniffio 1.2.0 storemagic NA terminado 0.11.1 testpath 0.5.0 threadpoolctl 3.0.0 tornado 6.1 tqdm 4.62.3 traitlets 5.1.0 typing_extensions NA uritemplate 3.0.1 urllib3 1.26.6 wcwidth 0.2.5 webencodings 0.5.1 websocket 1.2.1 yaml 6.0 zmq 22.2.1
----- IPython 7.27.0 jupyter_client 7.0.2 jupyter_core 4.7.1 jupyterlab 3.1.10 notebook 6.4.3 ----- Python 3.9.7 (default, Sep 3 2021, 12:37:55) [Clang 12.0.5 (clang-1205.0.22.9)] macOS-12.0.1-x86_64-i386-64bit ----- Session information updated at 2021-12-21 00:04